####################################
#Diff-in-diff with matching and dual-distance cutoffs - NJ+PA figure, dd with dd 
#Traffic and Political Attitudes
#Connor Jerzak & Brian Libgober
####################################
library(ggplot2)
setwd("~/Dropbox/Collaborations/Traffic/project_analysis/")

#load in data 
NJ_data <- read.csv("NJ_precinct_master.csv", header=T)
PA_data <- read.csv("PA_precinct_master.csv", header=T)
colnames(PA_data)[which(colnames(PA_data)=="EZPlaza_TravelDistOptimizedDist_PA_Total_Leng")] <- "EZPlaza_TravelDistOptimizedDist_Total_Leng"
colnames(NJ_data)[which(colnames(NJ_data)=="EZPlaza_TravelDistOptimizedDist_NJ_Total_Leng")] <- "EZPlaza_TravelDistOptimizedDist_Total_Leng"

#a bit of cleaning 
colnames(NJ_data)[which(colnames(NJ_data)=="USP_00")] <- "USPP2000"
colnames(NJ_data)[which(colnames(NJ_data)=="USP_04")] <- "USPP2004"

#create new variables 
PA_data$census_2000_pabove125 <- as.numeric(as.character(PA_data$census_2000_125to_150)) + 
         as.numeric(as.character(PA_data$census_2000_150to_200)) +
          as.numeric(as.character(PA_data$census_2000_200over))

PA_data$census_2009_pabove125 <-  as.numeric(as.character(PA_data$census_2009_125to_150)) + 
        as.numeric(as.character(PA_data$census_2009_150to_200)) + 
        as.numeric(as.character(PA_data$census_2009_200over))

NJ_data$census_2000_pabove125 <- as.numeric(as.character(NJ_data$census_2000_125to_150)) + 
        as.numeric(as.character(NJ_data$census_2000_150to_200)) +
        as.numeric(as.character(NJ_data$census_2000_200over))

NJ_data$census_2009_pabove125 <-  as.numeric(as.character(NJ_data$census_2009_125to_150)) + 
        as.numeric(as.character(NJ_data$census_2009_150to_200)) + 
        as.numeric(as.character(NJ_data$census_2009_200over))

#define outcomes 
outcome_time1_name <- "USPP2000"
outcome_time2_name <- "USPP2004"  #US president (dem) percent 2000, 2004

#define identify matching variables 
matching_variables <- c("census_2000_pblack", 
                        "census_2000_pfemale", 
                        "census_2000_pfam_belowpoverty", 
                        "census_2000_ppop_over65", 
                        "census_2000_pabove125") 

#define treated basis (something having to do with E-ZPasses)
treatment_basis_name <- "EZPlaza_TravelDistOptimizedDist_Total_Leng"

#define control basis (something having to do with non-E-ZPass highway)
control_basis_name <- c("minNetworkOptimizedDistance_Distance_nonEZ_hwy")

#define other variables to include in the regression
fixed_effect_names <- c("COUNTYFP10", 
                        "STATEFP10")

#NJ 
NJ_data_reduced <-  NJ_data[, c(treatment_basis_name, control_basis_name, fixed_effect_names, 
                               matching_variables, outcome_time1_name, outcome_time2_name) ] 
NJ_data_reduced <- as.data.frame(
                          cbind(apply(X=NJ_data_reduced[,which(!(colnames(NJ_data_reduced) %in% c("COUNTYFP10", "STATEFP10"))) ], 
                                      MARGIN=2, 
                                      FUN=as.numeric),
                          data.frame(NJ_data_reduced$COUNTYFP10, NJ_data_reduced$STATEFP10) ) )
colnames(NJ_data_reduced)[ncol(NJ_data_reduced)-1] <- "COUNTYFP10"
colnames(NJ_data_reduced)[ncol(NJ_data_reduced)] <- "STATEFP10"

#PA 
PA_data_reduced <- PA_data[, c(treatment_basis_name, control_basis_name, fixed_effect_names, 
                               matching_variables, outcome_time1_name, outcome_time2_name) ]
PA_data_reduced <- as.data.frame(
                        cbind(apply(X=PA_data_reduced[,which(!(colnames(PA_data_reduced) %in% c("COUNTYFP10", "STATEFP10"))) ], 
                              MARGIN=2, 
                              FUN=as.numeric),
                        data.frame(PA_data_reduced$COUNTYFP10, PA_data_reduced$STATEFP10) ) )
colnames(PA_data_reduced)[ncol(PA_data_reduced)-1] <- "COUNTYFP10"
colnames(PA_data_reduced)[ncol(PA_data_reduced)] <- "STATEFP10"

#combine 
my_data <- rbind(NJ_data_reduced, PA_data_reduced)
use_index1 <- which( (!(is.na(my_data$COUNTYFP10))) & (my_data$STATEFP10==34) )  
use_index2 <- which( (!(is.na(my_data$COUNTYFP10))) & (my_data$STATEFP10==42) )  
my_data[use_index1,]$COUNTYFP10 <- paste("state1_", my_data[use_index1, ]$COUNTYFP10, sep="")
my_data[use_index2,]$COUNTYFP10 <- paste("state2_", my_data[use_index2,]$COUNTYFP10, sep="") 

#remove rows with NAs 
my_data <- na.omit(my_data)


##############################
#loop to generate figures
##############################
##############################
max_length_out <- 5; min_length_out <- 5;

max_seq <- seq(2, 30, length.out = max_length_out)# max_seq <- seq(5, 14, length.out = max_length_out)
min_seq <- seq(2, 50, length.out = min_length_out)# min_seq <- seq(14, 36.7, length.out = min_length_out)

source("./causal_effect_estimation/dd_with_linearSDs_function.R")
BIG_STORAGE_LIST <- list()
counter1 <- 0 
for(j in min_seq)
{ 
  counter1 <- counter1 + 1 
  results_list <- list(); counter2 <- 0; for(i in max_seq)
  { 
    counter2 <- counter2 + 1
    print(counter2)
    results_list[[counter2]] <- dd_with_linearSDs(my_data=my_data,
                                                  treatment_basis_name=treatment_basis_name, 
                                                  control_basis_name=control_basis_name, 
                                                  outcome_time1_name=outcome_time1_name, 
                                                  outcome_time2_name=outcome_time2_name,
                                                  treatment_basis_treated_max=i, 
                                                  treatment_basis_control_min=j, 
                                                  control_basis_control_max=i,
                                                  type="binary_fixed_effects",
                                                  fixed_effect_names = fixed_effect_names,  
                                                  control_basis_treated_min=j,
                                                  match=T,
                                                  my_replace=F, 
                                                  start_browser=F)
  
  }
  BIG_STORAGE_LIST[[counter1]] <- results_list
}

####################
#save and plot results
####################
#save results as appropriate objects 
results_list <- unlist(BIG_STORAGE_LIST, recursive=F, use.names=T)
match_name <- "linearSDs_JOINT"
index_seq <- seq(1, max_length_out*min_length_out)
for(index in 1:min_length_out)
{
  start <- max_length_out*(index-1)+1
  end <- max_length_out*(index)
  
  internal_list <- results_list[start:end]
  
  #save results as appropriate objects 
  unlisted_results <- unlist(internal_list, recursive=T, use.names=T) #$base_result
  base_results <- unlisted_results[which(names(unlisted_results)=="lm_summary.Estimate")]
  SDs <- unlisted_results[which(names(unlisted_results)=="lm_summary.Std. Error")]
  
  #setup for ggplot 2 
  gg_data <- data.frame(cbind(max_seq, base_results, base_results-1.96*SDs, base_results+1.96*SDs))
  colnames(gg_data)[ncol(gg_data)-1] <- "lower"
  colnames(gg_data)[ncol(gg_data)] <- "upper"
  
  g <- ggplot(gg_data, aes(x=max_seq, y=base_results)) + 
    geom_point() + 
    geom_errorbar(aes(ymin=lower, ymax=upper), width=0.2) + 
    geom_hline(yintercept = 0, line_type="dashed") + 
    geom_errorbar(aes(ymin=lower, ymax=upper), width=0.2) + 
    ylab("DiD effect (in % points)") + 
    xlab("Maximum distance from E-ZPass or Control Highway") + 
    ggtitle("Maximum Distance vs. DiD Effect (in % points)") + 
    theme(text = element_text(size=13)) 
  ggsave(g, file=paste(match_name, sprintf("%i.png", index), sep=""))
}




